
###########################################################
##### Haiti elite network project  		          			#####
##### clean genealogical family data	             		#####
##### 2021 mar 03                   									#####
###########################################################


## read in list of names and types from trade data
## create last name variable
## match list of comp names to corp data
## match list of ind names to genealogy
## match list of ind names to pol bios
## match list of ind names to mil bios


#####
## basic genealogy cleaning
#####

## Load the raw genealogy extract. as.is = TRUE keeps text columns as
## character vectors rather than converting them to factors.
gen <- read.csv("01_Data/01_Raw/03_Family/haiti_gene_041613.csv", as.is = TRUE)
dim(gen)

## use lower-case column names from here on
colnames(gen) <- tolower(colnames(gen))


## collapse every run of consecutive "?" in the name field to a single "?"
## (a single regex pass; the earlier fixed-string passes only fully
## collapsed runs up to a limited length)
gen$name <- gsub("[?]{2,}", "?", gen$name)


## recode the placeholder "NONE" and empty strings as missing (NA) in
## every column; %in% never yields NA, so values that are already NA are
## left as they are and each column keeps its type

gen[] <- lapply(gen, function(column) {
  column[column %in% c("NONE", "")] <- NA
  column
})



## separate out last name
## Builds gen$last from gen$name: known typos, initials and roman numerals
## are stripped, then the all-caps portion of the name is extracted.
## The substitutions below are order-dependent; do not reorder them.

  # pad each name with a leading and trailing space so that tokens at the
  # start/end of a name are space-delimited too, then collapse repeated spaces
  # NOTE(review): a run of 3-4 spaces still leaves a double space after
  # these two passes; the initial-stripping patterns further down also
  # consume runs of spaces.

gen$last <- paste0(" ", gen$name, " ")
gen$last <- gsub("  ", " ", gen$last)
gen$last <- gsub("   ", " ", gen$last)

  # fix specific typos, then delete middle initials and roman numerals
  # NOTE(review): except for the "?" replacement, these calls do not set
  # fixed = T, so "." in "JB.", "J-L.", ". V. M.", "J-P." and "HG. " is a
  # regex wildcard (matches any character), not a literal dot - confirm
  # this is intended.

gen$last <- gsub("DanieI", "Daniel", gen$last)
gen$last <- gsub("AA GARDERE", "GARDERE", gen$last)
gen$last <- gsub("P-Y", "", gen$last)
gen$last <- gsub("E-C", "", gen$last)
gen$last <- gsub("P-C", "", gen$last)
gen$last <- gsub("LucasS", "Lucas", gen$last)
gen$last <- gsub("JB.", "", gen$last)
gen$last <- gsub("J-P ", "", gen$last)
gen$last <- gsub("A-M ", "", gen$last)
gen$last <- gsub("J-L.", "", gen$last)
gen$last <- gsub(". V. M.", "", gen$last)
gen$last <- gsub("J-P.", "", gen$last)
gen$last <- gsub("JKL ", "", gen$last)
gen$last <- gsub("HG. ", "", gen$last)
  # literal "?" (fixed = T) becomes a space
gen$last <- gsub("?", " ", gen$last, fixed = T)
gen$last <- gsub("YvesII", "Yves", gen$last)
  # replace <delimiter><optional single capital><one or more delimiters>
  # with one space, where a delimiter is a space, "." or "|" (inside [...]
  # the "|" is a literal character, not alternation). The pattern is
  # applied twice - presumably because back-to-back initials share a
  # delimiter and gsub matches do not overlap.
gen$last <- gsub("[ |.][A-Z]{0,1}[ |.]{1,}", " ", gen$last)
gen$last <- gsub("[ |.][A-Z]{0,1}[ |.]{1,}", " ", gen$last)
  # same shape for delimiter-bounded tokens made of up to four "I", and "IV"
gen$last <- gsub("[ |.][I]{0,4}[ |.]{1,}", " ", gen$last)
gen$last <- gsub("[ |.][I][V][ |.]{1,}", " ", gen$last)
  # drop apostrophes
gen$last <- gsub("[']", "", gen$last)


  # extract the all-caps last name: the first match of up to three optional
  # runs of capitals (optionally space-separated), then one of "-", "|" or
  # space, then at least two capitals; names with no match become NA
  # NOTE(review): str_extract() comes from stringr, and no library(stringr)
  # call is visible in this file - presumably it is loaded by the caller.
  # NOTE(review): [A-Z] may not match accented capitals (locale-dependent);
  # confirm that surnames containing accents are captured.

gen$last <- str_extract(string = gen$last, pattern = "([ ]*[A-Z]*[ ]*[A-Z]*[ ]*[A-Z]*[-| ][A-Z]{2,})")

# trim leading and trailing spaces

gen$last <- gsub("^ +", "", gen$last)
gen$last <- gsub(" *$", "", gen$last)

# recode missing last names as the literal string "NA" (not a real NA)

gen$last[is.na(gen$last)==T] <- "NA"


## write the sorted unique last names to csv

write.csv(sort(unique(gen$last)), "01_Data/02_Clean/gen_last.csv")


## convert the reference id and all year fields to numeric

# the 13th column holds the reference id: name it 'ref' and remove any "#"
names(gen)[13] <- "ref"
gen[["ref"]] <- as.numeric(gsub("#", "", gen[["ref"]]))

# death, birth and marriage years: coerce to numeric (text that is not a
# number becomes NA) and blank out any value below 1000
year_cols <- c("deces_y", "naissance_y", paste0("marriage", 1:10, "_y"))
for (yc in year_cols) {
  yr <- as.numeric(gen[[yc]])
  yr[which(yr < 1000)] <- NA
  gen[[yc]] <- yr
}


## keep one row per person id: drop rows whose 'no' is missing or repeats
## the 'no' of an earlier row

dim(gen)
keep_row <- !duplicated(gen$no) & !is.na(gen$no)
gen <- gen[keep_row, ]
dim(gen)


#####
## cohort
#####

## Give every person a cohort (an approximate birth year).
##   cohort_type = 0  cohort comes from the person's own birth year or
##                    first-marriage year
##   cohort_type = k  cohort was filled in on pass k of the loop below,
##                    from the cohorts of spouses, siblings and children
##   cohort_type = NA no cohort could be assigned

# offset in years: subtracted from the first-marriage year to approximate a
# birth year, and from a child's cohort to approximate the parent's cohort
age_m <- 25

# own information: birth year if known, else first-marriage year minus age_m
gen$cohort <- ifelse(!is.na(gen$naissance_y),
                     gen$naissance_y,
                     gen$marriage1_y - age_m)
gen$cohort_type <- ifelse(!is.na(gen$cohort), 0, NA)

gen2 <- gen

## number of imputation passes; each pass can only use cohorts known at the
## start of that pass, so more passes reach more distant relatives
reps <- 7

## relative-id columns in gen2 (rel_keys) and the temporary column that
## receives that relative's cohort (cols); the two vectors are parallel.
## Built once here because they do not change between passes.
rel_keys <- c(paste0("spouse_", 1:10),
              paste0("fratri", 1:10),
              paste0("child_1_", 1:10))
cols <- c(paste0("cohort_sp", 1:10),
          paste0("cohort_f", 1:10),
          paste0("cohort_c", 1:10))

for (k in seq_len(reps)) {

  ## lookup table of everyone's cohort as it stands at the start of the pass
  temp <- gen2[c("no", "cohort")]
  message(sprintf("pass %d: %.1f%% of records have a cohort at start",
                  k, 100 * mean(!is.na(temp$cohort))))

  ## attach each relative's cohort as its own temporary column
  for (i in seq_along(cols)) {
    colnames(temp) <- c("no", cols[i])
    gen2 <- merge(gen2, temp, by.x = rel_keys[i], by.y = "no", all.x = TRUE)
    # for children, shift the child's cohort back by age_m to estimate the
    # parent's cohort
    if (startsWith(cols[i], "cohort_c")) {
      gen2[[cols[i]]] <- gen2[[cols[i]]] - age_m
    }
  }

  ## fill missing cohorts with the mean over all relatives' cohorts.
  ## rowMeans(na.rm = TRUE) gives NaN when every relative is missing; store
  ## NA instead so NaN never reaches the output.
  rel_mean <- rowMeans(gen2[cols], na.rm = TRUE)
  rel_mean[is.nan(rel_mean)] <- NA
  gen2$cohort <- ifelse(is.na(gen2$cohort), rel_mean, gen2$cohort)

  ## record the pass on which a cohort first became available
  gen2$cohort_type <- ifelse(!is.na(gen2$cohort_type),
                             gen2$cohort_type,
                             ifelse(!is.na(gen2$cohort), k, NA))

  ## drop the temporary relative-cohort columns before the next pass
  gen2 <- gen2[setdiff(colnames(gen2), cols)]

}

## share of records by cohort source, and overall share with a cohort
table(gen2$cohort_type) / nrow(gen2)
mean(!is.na(gen2$cohort_type))


## bin cohort into 25-, 15- and 10-year groups
## each entry: new column name -> c(upper end of the breaks, bin width)

bin_spec <- list(cohort_g   = c(2025, 25),
                 cohort_g15 = c(2020, 15),
                 cohort_g10 = c(2020, 10))
for (bin_col in names(bin_spec)) {
  spec <- bin_spec[[bin_col]]
  gen2[[bin_col]] <- cut(gen2$cohort,
                         breaks = seq(1600, spec[1], spec[2]),
                         dig.lab = 4)
}

# share of records that fall in a 15-year bin, and counts per 25-year bin
mean(!is.na(gen2$cohort_g15))
table(gen2$cohort_g)


## keep only the first row for each person id

gen2 <- gen2[!duplicated(gen2$no), ]



#####
## write to csv
#####

write.csv(gen2, file = '01_Data/02_Clean/gene_clean.csv')


#####
## put into graph form 
#####

## edge list: one row per (person, parent) pair - father rows stacked on
## mother rows - carrying the person's cohort bins as edge attributes

parent_mat <- smartbind(
  cbind.data.frame('ego' = gen2$no, 'parent' = gen2$pere,
                   'cohort25' = gen2$cohort_g,
                   'cohort15' = gen2$cohort_g15,
                   'cohort10' = gen2$cohort_g10),
  cbind.data.frame('ego' = gen2$no, 'parent' = gen2$mere,
                   'cohort25' = gen2$cohort_g,
                   'cohort15' = gen2$cohort_g15,
                   'cohort10' = gen2$cohort_g10))

# keep only edges where both endpoints are known, ordered by person id
parent_mat <- parent_mat[!is.na(parent_mat$ego) & !is.na(parent_mat$parent), ]
parent_mat <- parent_mat[order(parent_mat$ego), ]


## vertex table: every id that appears at either end of an edge

# c() pools both id columns into one vector so unique() removes repeated
# ids; unique() on rbind() output would compare whole matrix rows instead
# and remove nothing
vertices <- matrix(unique(c(parent_mat$ego, parent_mat$parent)))
colnames(vertices) <- 'ego'

attributes <- cbind.data.frame('ego' = gen2$no, 'fname' = gen2$name,
                               'sex' = gen2$sexe, 'birth' = gen2$naissance_y,
                               'death' = gen2$deces_y, 'occ' = gen2$occupation,
                               'marriage1' = gen2$marriage1_y,
                               'marriage2' = gen2$marriage2_y,
                               'family' = gen2$last, 'cohort' = gen2$cohort,
                               'cohort25' = gen2$cohort_g,
                               'cohort15' = gen2$cohort_g15,
                               'cohort10' = gen2$cohort_g10)

# all.x = TRUE keeps ids that only occur as a parent and have no row of
# their own in gen2; their attributes are NA
attributes <- merge(vertices, attributes, by = 'ego', all.x = TRUE)
attributes <- attributes[order(attributes$ego), ]
attributes <- unique(attributes)

## put into graph form (directed edges from ego to parent)

parent_graph <- graph.data.frame(parent_mat, vertices = attributes,
                                 directed = TRUE)
summary(parent_graph)


#####
## write graph (edges and vertex attributes) to a GraphML file
#####

write.graph(parent_graph, '01_Data/02_Clean/gene_parent.graphml',
            format = 'graphml')



#####
## marriage relationships
#####

## spouse edge list: one block of rows per spouse slot (spouse_1 ..
## spouse_10), each row carrying the person's cohort bins and spouse = 1

spouse_blocks <- lapply(seq_len(10), function(slot) {
  cbind.data.frame('ego' = gen2$no,
                   'alter' = gen2[[paste0("spouse_", slot)]],
                   'cohort25' = gen2$cohort_g,
                   'cohort15' = gen2$cohort_g15,
                   'cohort10' = gen2$cohort_g10,
                   'spouse' = rep(1, nrow(gen2)))
})
spouse_mat <- do.call(smartbind, spouse_blocks)

# keep only edges where both endpoints are known, ordered by person id
has_both <- !is.na(spouse_mat$ego) & !is.na(spouse_mat$alter)
spouse_mat <- spouse_mat[has_both, ]
spouse_mat <- spouse_mat[order(spouse_mat$ego), ]

# bring the parent edges to the same layout: second column becomes 'alter',
# and a 'parent' indicator (always 1) is appended as the last column
colnames(parent_mat)[2] <- 'alter'
parent_mat$parent <- rep(1, nrow(parent_mat))

# full outer join of the two edge lists; an edge missing from one list gets
# 0 in that list's indicator column
ps_mat <- merge(parent_mat, spouse_mat,
                by = c('ego', 'alter', 'cohort15', 'cohort25', 'cohort10'),
                all = TRUE)
for (flag in c("parent", "spouse")) {
  ps_mat[[flag]][is.na(ps_mat[[flag]])] <- 0
}


## vertex table: every id that appears at either end of an edge

vertices <- matrix(unique(c(ps_mat$ego, ps_mat$alter)))

colnames(vertices) <- 'ego'

attributes <- cbind.data.frame(
  'ego' = gen2$no,
  'fname' = gen2$name,
  'sex' = gen2$sexe,
  'birth' = gen2$naissance_y,
  'death' = gen2$deces_y,
  'occ' = gen2$occupation,
  'marriage1' = gen2$marriage1_y,
  'marriage2' = gen2$marriage2_y,
  'family' = gen2$last,
  'cohort' = gen2$cohort
)

attributes <- merge(vertices, attributes, by = 'ego', all.x = TRUE)
attributes <- attributes[order(attributes$ego), ]
attributes <- unique(attributes)


## show any ids that still occur more than once in the vertex table

attributes[duplicated(attributes$ego), ]


## put into graph form (directed)

ps_graph <- graph.data.frame(ps_mat, vertices = attributes, directed = TRUE)
summary(ps_graph)


## write the graph to a GraphML file

write.graph(ps_graph, '01_Data/02_Clean/gene_ps_all.graphml',
            format = 'graphml')








  
